Code
library(tidyverse)
library(magrittr)
library(gt)
library(patchwork)
library(SummarizedExperiment)
library(tidySummarizedExperiment)
library(MultiAssayExperiment)
tmp <- fs::dir_map("R scripts/", source)
theme_set(theme_light())MultiAssayExperiment objectlibrary(tidyverse)
library(magrittr)
library(gt)
library(patchwork)
library(SummarizedExperiment)
library(tidySummarizedExperiment)
library(MultiAssayExperiment)
tmp <- fs::dir_map("R scripts/", source)
theme_set(theme_light())The purpose of this document is to create a MultiAssayExperiment object containing all assays and clinical data generated in the context of the VIBRANT trial at the participant x visit resolution (which means that duplicates were aggregated in previous steps such that for a given assay/variable, there is a single value for each participant and visit combination).
The document is organised as follow:
First, we load and minimally transform the “clinical” data (i.e., data collected through CRFs).
Next, we load and minimally transform the SummarizedExperiment objects for each assay (metagenomics, qPCR, 16S amplicon sequencing, Luminex, and flow cytometry).
Finally, we create the MultiAssayExperiment object by combining the clinical data and assay data. This entails several steps, including creating a mae_colData table that combines the clinical data and assay sample information, ensuring that the uids (unique identifiers) are consistent across all data tables.
At the end, we document which assays are available (= have data) for which participants and visits.
crf_files <-
get_01_output_dir() |>
fs::dir_ls() |>
str_subset("/01_")
crf_clean_file <- crf_files |> str_subset("01_crf_clean") |> sort(decreasing = TRUE) |> magrittr::extract(1)
crf_raw_file <- crf_files |> str_subset("01_crf_raw") |> sort(decreasing = TRUE) |> magrittr::extract(1)
crf_dict_file <- crf_files |> str_subset("01_crf_plates_dictionary") |> sort(decreasing = TRUE) |> magrittr::extract(1)
load(crf_clean_file, verbose = TRUE)Loading objects:
crf_clean
load(crf_raw_file, verbose = TRUE)Loading objects:
crf_raw
load(crf_dict_file, verbose = TRUE)Loading objects:
crf_plates_dictionary
rm(crf_clean_file, crf_raw_file, crf_dict_file)Participants tableload(
crf_files |> str_subset("01_participants_2025") |> sort(decreasing = TRUE) |> magrittr::extract(1),
verbose = TRUE
)Loading objects:
participants
load(
crf_files |> str_subset("01_participants_variable_dictionary") |> sort(decreasing = TRUE) |> magrittr::extract(1),
verbose = TRUE
)Loading objects:
participants_variable_dictionary
load(
crf_files |> str_subset("01_participant_crfs_merged") |> sort(decreasing = TRUE) |> magrittr::extract(1),
verbose = TRUE
)Loading objects:
participant_crfs_merged
Visits tableload(
crf_files |> str_subset("01_visits_2025") |> sort(decreasing = TRUE) |> magrittr::extract(1),
verbose = TRUE
)Loading objects:
visits
load(
crf_files |> str_subset("01_visits_long_2025") |> sort(decreasing = TRUE) |> magrittr::extract(1),
verbose = TRUE
)Loading objects:
visits_long
load(
crf_files |> str_subset("01_visits_crfs_merged") |> sort(decreasing = TRUE) |> magrittr::extract(1),
verbose = TRUE
)Loading objects:
visits_crfs_merged
visits |>
ggplot() +
aes(x = study_day, y = pid |> factor(), color = visit_code |> factor()) +
geom_point() We define the uid (unique identifier):
visits <-
visits |>
mutate(
uid = str_c(pid, visit_code, sep = "_")
) |>
select(uid, everything())Exposures tableload(
crf_files |> str_subset("01_exposures_") |> sort(decreasing = TRUE) |> magrittr::extract(1),
verbose = TRUE
)Loading objects:
exposures
SummarizedExperiment objectsdata_dir <- get_01_output_dir()
se_files <- fs::dir_ls(data_dir, regexp = "se_.*\\.rds$") |> sort(decreasing = TRUE)mg_file <- se_files[grepl("mg", se_files)][1]
se_mg <- readRDS(mg_file)
se_mg <- check_se(se_mg)
se_mg# A SummarizedExperiment-tibble abstraction: 775,884 × 58
# Features=779 | Samples=996 | Assays=count, count_corr, rel_ab_all,
# rel_ab_bact, rel_ab
.feature .sample count count_corr rel_ab_all rel_ab_bact rel_ab uid mg_uid
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
1 C0022A1 0681000… 0 0 0 0 0 0681… MG_202
2 C0059E1 0681000… 0 0 0 0 0 0681… MG_202
3 C0175A1 0681000… 0 0 0 0 0 0681… MG_202
4 FF00018 0681000… 0 0 0 0 0 0681… MG_202
5 FF00051 0681000… 0 0 0 0 0 0681… MG_202
6 UC101 0681000… 0 0 0 0 0 0681… MG_202
7 122010 0681000… 0 0 0 0 0 0681… MG_202
8 185329 0681000… 0 0 0 0 0 0681… MG_202
9 C0006A1 0681000… 0 0 0 0 0 0681… MG_202
10 C0028A1 0681000… 0 0 0 0 0 0681… MG_202
# ℹ 40 more rows
# ℹ 49 more variables: sample_type <fct>, control_type <fct>,
# total_non_human_reads <dbl>, rel_ab_multi_genera <dbl>,
# poor_quality_mg_data <lgl>, cst <chr>, sub_cst <chr>, valencia_score <dbl>,
# swab_barcode <chr>, mg_pid <chr>, mg_visit_code <chr>,
# visit_code_flagged <lgl>, mg_sample_type <chr>, ext_lib_plate_nb <dbl>,
# ext_lib_plate_id <dbl>, ext_lib_position <chr>, …
qPCR_file <- se_files[grepl("03_se_pcr_agg", se_files)][1]
se_qPRC <- readRDS(qPCR_file)
se_qPRC <- check_se(se_qPRC)
se_qPRC# A SummarizedExperiment-tibble abstraction: 16,464 × 26
# Features=16 | Samples=1029 | Assays=n_non_na, dilution, copies_per_swab_med,
# copies_per_swab_mean, copies_per_swab_cv, copies_per_swab_orig_med,
# copies_per_swab_LL3_med, copies_per_swab_LM_med
.feature .sample n_non_na dilution copies_per_swab_med copies_per_swab_mean
<chr> <chr> <int> <dbl> <dbl> <dbl>
1 C0175A1 06810000… 3 5 0 0
2 C0022A1 06810000… 3 5 0 0
3 C0059E1 06810000… 3 5 0 0
4 UC101 06810000… 3 5 0 0
5 FF00018 06810000… 3 5 0 0
6 FF00051 06810000… 3 5 0 0
7 185329 06810000… 3 20 0 0
8 C0028A1 06810000… 3 20 0 0
9 C0112A1 06810000… 3 20 0 0
10 FF00004 06810000… 3 20 0 0
# ℹ 310 more rows
# ℹ 20 more variables: copies_per_swab_cv <dbl>,
# copies_per_swab_orig_med <dbl>, copies_per_swab_LL3_med <dbl>,
# copies_per_swab_LM_med <dbl>, uid <chr>, qpcr_sample_type <fct>,
# sample_type <chr>, control_type <chr>, qpcr_sample_id <chr>,
# vibr_sample_id <chr>, ext_lib_plate_nb <chr>, target <fct>, fluor <chr>,
# strain_group_qpcr <chr>, taxon_label <chr>, LBP <fct>, strain_id <fct>, …
daily_qPCR_file <- se_files[grepl("30_se_pcr_agg", se_files)][1]
se_daily_qPRC <- readRDS(daily_qPCR_file)
se_daily_qPRC <- check_se(se_daily_qPRC)
se_daily_qPRC# A SummarizedExperiment-tibble abstraction: 47,840 × 24
# Features=16 | Samples=2990 | Assays=n_non_na, dilution, copies_per_swab_med,
# copies_per_swab_mean, copies_per_swab_cv
.feature .sample n_non_na dilution copies_per_swab_med copies_per_swab_mean
<chr> <chr> <int> <dbl> <dbl> <dbl>
1 C0175A1 06810000… 3 10 0 0
2 C0022A1 06810000… 3 10 0 0
3 C0059E1 06810000… 3 10 0 0
4 UC101 06810000… 3 10 568. 1170.
5 FF00018 06810000… 3 10 2436. 2069.
6 FF00051 06810000… 3 10 1625. 1730.
7 185329 06810000… 3 10 0 0
8 C0028A1 06810000… 3 10 0 0
9 C0112A1 06810000… 3 10 0 0
10 FF00004 06810000… 3 10 0 0
# ℹ 310 more rows
# ℹ 18 more variables: copies_per_swab_cv <dbl>, uid <chr>,
# qpcr_sample_type <chr>, sample_type <chr>, control_type <chr>,
# qpcr_sample_id <chr>, vibr_sample_id <chr>, pcr_plate_id <chr>,
# pcr_plate_nb <chr>, ext_lib_plate_nb <chr>, target <fct>, fluor <chr>,
# strain_group_qpcr <chr>, taxon_label <chr>, LBP <fct>, strain_id <fct>,
# strain_origin <fct>, biose_id <chr>
amplicon_file <- se_files[grepl("16S_agg", se_files)][1]
se_16S <- readRDS(amplicon_file)
se_16S <- check_se(se_16S)
se_16S# A SummarizedExperiment-tibble abstraction: 3,365,223 × 48
# Features=817 | Samples=4119 | Assays=counts, rel_ab
.feature .sample counts rel_ab uid exclude_sample total_counts sample_type
<chr> <chr> <dbl> <dbl> <chr> <chr> <dbl> <chr>
1 Lactobac… 068100… 6904 0.0933 0681… No 74008 Clinical s…
2 Lactobac… 068100… 0 0 0681… No 74008 Clinical s…
3 Gardnere… 068100… 8284 0.112 0681… No 74008 Clinical s…
4 Ca_Lachn… 068100… 0 0 0681… No 74008 Clinical s…
5 Sneathia… 068100… 21143 0.286 0681… No 74008 Clinical s…
6 Fannyhes… 068100… 1601 0.0216 0681… No 74008 Clinical s…
7 Sneathia… 068100… 0 0 0681… No 74008 Clinical s…
8 Prevotel… 068100… 1963 0.0265 0681… No 74008 Clinical s…
9 Prevotel… 068100… 0 0 0681… No 74008 Clinical s…
10 f_Dethio… 068100… 3629 0.0490 0681… No 74008 Clinical s…
# ℹ 40 more rows
# ℹ 40 more variables: control_type <chr>, sample_id <chr>,
# sample_id_barcode <chr>, patient_id <chr>, visit_code_ps <chr>,
# idt_adapter_name <chr>, i5_index_forward <chr>, i5_index_reverse <chr>,
# i7_index_reverse <chr>, i7_index <chr>, sample_id_2 <chr>,
# total_spike_in_rel <dbl>, total_spike_in_counts <dbl>, cst <chr>,
# sub_cst <chr>, score <dbl>, specimen_type <chr>, well_id <chr>, …
luminex_file <- se_files[grepl("luminex_agg", se_files)][1]
se_luminex <- readRDS(luminex_file)
se_luminex <- check_se(se_luminex)
se_luminex# A SummarizedExperiment-tibble abstraction: 40,800 × 37
# Features=48 | Samples=850 | Assays=obs_conc, obs_conc_imp, value_type,
# value_types, sd_log10_obs_conc_imp, obs_conc_log10, obs_conc_imp_log10
.feature .sample obs_conc obs_conc_imp value_type value_types
<chr> <chr> <dbl> <dbl> <chr> <chr>
1 CTACK 068100004_0000 207. 207. Observed concentr… Observed c…
2 Eotaxin 068100004_0000 90.7 90.7 Observed concentr… Observed c…
3 FGF basic 068100004_0000 526. 526. Observed concentr… Observed c…
4 G-CSF 068100004_0000 767. 767. Observed concentr… Observed c…
5 GM-CSF 068100004_0000 64.2 64.2 Observed concentr… Observed c…
6 GRO-a 068100004_0000 3001. 3001. Observed concentr… Observed c…
7 HGF 068100004_0000 3805. 3805. Observed concentr… Observed c…
8 IFN-a2 068100004_0000 328. 328. Observed concentr… Observed c…
9 IFN-g 068100004_0000 4074. 4074. Observed concentr… Observed c…
10 IL-10 068100004_0000 33.2 33.2 Observed concentr… Observed c…
# ℹ 38 more rows
# ℹ 31 more variables: sd_log10_obs_conc_imp <dbl>, obs_conc_log10 <dbl>,
# obs_conc_imp_log10 <dbl>, uid <chr>, sample_id <chr>, visit <dbl>,
# weight <dbl>, volume <dbl>, approx_volume_based_on_aliquot_repos <dbl>,
# aliquot_volume <dbl>, dilution_200ul_tube <dbl>,
# volume_added_for_assay <dbl>, dilution_manifest <dbl>, dilution_new <dbl>,
# shorthand_title <chr>, alternate_title <chr>, …
flow_file <- se_files[grepl("flow", se_files)][1]
se_flow <- readRDS(flow_file)
se_flow <- check_se(se_flow)
se_flow# A SummarizedExperiment-tibble abstraction: 3,916 × 12
# Features=11 | Samples=356 | Assays=count, percentage
.feature .sample count percentage uid sample machine sample_type
<chr> <chr> <dbl> <dbl> <chr> <chr> <chr> <chr>
1 Live 068100004_10… 2.16e6 NA 0681… Speci… 5R Clinical s…
2 Granulocytes 068100004_10… 2.98e4 1.38 0681… Speci… 5R Clinical s…
3 B_cells 068100004_10… 3.68e2 NA 0681… Speci… 5R Clinical s…
4 APC 068100004_10… 7.88e2 NA 0681… Speci… 5R Clinical s…
5 CD40 068100004_10… 4.06e2 51.5 0681… Speci… 5R Clinical s…
6 CD80 068100004_10… 3.82e2 48.5 0681… Speci… 5R Clinical s…
7 CD86 068100004_10… 4.92e2 62.4 0681… Speci… 5R Clinical s…
8 CD4 068100004_10… 4.13e3 NA 0681… Speci… 5R Clinical s…
9 CD8 068100004_10… 1.48e3 NA 0681… Speci… 5R Clinical s…
10 DblePosCD4 068100004_10… 1.58e2 3.82 0681… Speci… 5R Clinical s…
# ℹ 210 more rows
# ℹ 4 more variables: control_type <chr>, cell_type <chr>,
# parent_cell_type <fct>, cell_type_label <fct>
MultiAssayExperiment objectassay_list <-
bind_rows(
tibble(
mae_assay_name = "mg",
se_object_name = "se_mg",
assay_label = "Metagenomic"
),
tibble(
mae_assay_name = "qPCR",
se_object_name = "se_qPRC",
assay_label = "qPCR weekly swabs"
),
tibble(
mae_assay_name = "qPCR_daily",
se_object_name = "se_daily_qPRC",
assay_label = "qPCR home swabs"
),
tibble(
mae_assay_name = "amplicon",
se_object_name = "se_16S",
assay_label = "16S rRNA amplicon seq."
),
tibble(
mae_assay_name = "luminex",
se_object_name = "se_luminex",
assay_label = "Luminex"
),
tibble(
mae_assay_name = "flow",
se_object_name = "se_flow",
assay_label = "Flow cytometry"
)
) |>
mutate(
assay_label = assay_label |> fct_inorder()
)We will assemble the following tables (SE objects):
assay_list |> gt()| mae_assay_name | se_object_name | assay_label |
|---|---|---|
| mg | se_mg | Metagenomic |
| qPCR | se_qPRC | qPCR weekly swabs |
| qPCR_daily | se_daily_qPRC | qPCR home swabs |
| amplicon | se_16S | 16S rRNA amplicon seq. |
| luminex | se_luminex | Luminex |
| flow | se_flow | Flow cytometry |
mae_coldata_assays tablesavailable_samples <-
map(
1:nrow(assay_list),
function(i, assay_list){
se <- get(assay_list$se_object_name[i])
tibble(
assay = assay_list$mae_assay_name[i],
se_colname = se |> colnames(),
uid = se$uid,
sample_type = se$sample_type,
control_type = se$control_type,
)
},
assay_list = assay_list
) |>
bind_rows()
if (!all(available_samples$se_colname == available_samples$uid, na.rm = TRUE)) {
stop("The uid column in the SE objects does not match their colnames")
}
mae_coldata_assays <-
available_samples |>
group_by(uid) |>
summarize(
n_sample_type = n_distinct(sample_type),
n_control_type = n_distinct(control_type),
sample_type = sample_type[1],
control_type = control_type[1],
assays = str_c(assay, collapse = ", ")
)
if (any(mae_coldata_assays$n_sample_type > 1)) {
warning("Some samples have multiple sample types")
}
if (any(mae_coldata_assays$n_control_type > 1)) {
warning("Some samples have multiple control types")
}
mae_coldata_assays <-
mae_coldata_assays |>
select(uid, sample_type, control_type, assays) The mae_coldata_assays table contains the following columns:
mae_coldata_assays |> colnames()[1] "uid" "sample_type" "control_type" "assays"
mae_coldata_assays |> str()tibble [4,185 × 4] (S3: tbl_df/tbl/data.frame)
$ uid : chr [1:4185] "068100004_0000" "068100004_1000" "068100004_1001" "068100004_1002" ...
$ sample_type : chr [1:4185] "Clinical sample" "Clinical sample" "Clinical sample" "Clinical sample" ...
$ control_type: chr [1:4185] "" "" "" "" ...
$ assays : chr [1:4185] "mg, qPCR, amplicon, luminex" "mg, qPCR, amplicon, luminex, flow" "qPCR_daily, amplicon" "qPCR_daily, amplicon" ...
mae_coldata_assays |>
dplyr::count(sample_type, control_type) |>
gt(caption = "Number of samples per sample type and control type") | sample_type | control_type | n |
|---|---|---|
| Clinical sample | 3962 | |
| Clinical sample (other study) | 7 | |
| Negative control | Nuclease-free water | 46 |
| Negative control | Unused swab + C2 | 47 |
| Positive control | Mock 1 | 57 |
| Positive control | Mock 2 | 57 |
| Test sample | 9 |
We also remove the columns sample_type and control_type from the colData of the individual assays to avoid conflicts when joining with the mae@colData table in downstream analyses.
for (i in 1:nrow(assay_list)) {
se <- get(assay_list$se_object_name[i])
if (("sample_type" %in% colnames(colData(se))) | ("control_type" %in% colnames(colData(se)))) {
colData(se) <- colData(se)[, !colnames(colData(se)) %in% c("sample_type", "control_type")]
}
assign(assay_list$se_object_name[i], se)
}MAE@colData: joining the visits and mae_coldata_assays tablesWe first do a full join of the visits and mae_coldata_assays tables by uid. This will allow us to keep all visits, even those that do not have assay data, and to identify which visits are in the CRF data and which are in the assay data.
mae_coldata <-
# we first do a full join of the CRF visits and mae_coldata_assays by uids;
dplyr::full_join(
visits |> select(-starts_with("specimen_collection")) |> mutate(in_CRF = TRUE),
mae_coldata_assays |> mutate(in_assays = TRUE) |> select(in_assays, everything()),
by = join_by(uid)
) |>
mutate(
in_CRF = in_CRF |> replace_na(FALSE),
in_assays = in_assays |> replace_na(FALSE)
)# We notice that there are 5 "Clinical sample" entries for which we do have assay data but that did not appear in the CRF data:
mae_coldata |>
filter(is.na(visit_code), sample_type == "Clinical sample")
# we fix these belowmae_coldata <-
mae_coldata |>
mutate(
needs_fixing = is.na(pid) & (sample_type == "Clinical sample") & str_detect(uid, "[0-9]{9}_[0-9]{4}"),
pid =
case_when(
needs_fixing ~ str_remove(uid, "_[0-9]{4}"),
TRUE ~ pid
),
visit_code =
case_when(
needs_fixing ~ str_remove(uid, "[0-9]{9}_"),
TRUE ~ visit_code
),
study_day =
case_when(
needs_fixing & (visit_code %in% c("0001", "0011")) ~ -7,
needs_fixing & in_assays & (visit_code == "1000") ~ 0,
TRUE ~ study_day
),
visit_attended =
case_when(
is.na(visit_attended) & needs_fixing ~ TRUE,
TRUE ~ visit_attended
),
visit_planned =
case_when(
is.na(visit_planned) & needs_fixing ~ "Unplanned visit",
TRUE ~ visit_planned
),
visit_type =
case_when(
is.na(visit_type) & needs_fixing ~ "Clinic",
TRUE ~ visit_type
)
) We then merge that table with the participants table to add the participant-invariant information (e.g., randomized, mITT, etc.) to the mae_coldata table.
# we add the participant-invariant information from the `participants` table
mae_coldata <-
mae_coldata |>
dplyr::left_join(
participants,
by = join_by(pid)
)mae_coldata <- mae_coldata |> select(-needs_fixing)mae_coldata |>
dplyr::count(in_CRF, randomized, in_assays, sample_type, assays) |>
gt()| in_CRF | randomized | in_assays | sample_type | assays | n |
|---|---|---|---|---|---|
| FALSE | FALSE | TRUE | Clinical sample | mg, qPCR, amplicon, flow | 2 |
| FALSE | FALSE | TRUE | Clinical sample | qPCR_daily, amplicon | 1 |
| FALSE | NA | TRUE | Clinical sample (other study) | mg, qPCR | 7 |
| FALSE | NA | TRUE | Negative control | amplicon | 70 |
| FALSE | NA | TRUE | Negative control | mg, qPCR | 12 |
| FALSE | NA | TRUE | Negative control | qPCR | 11 |
| FALSE | NA | TRUE | Positive control | amplicon | 91 |
| FALSE | NA | TRUE | Positive control | mg | 1 |
| FALSE | NA | TRUE | Positive control | mg, qPCR | 10 |
| FALSE | NA | TRUE | Positive control | qPCR | 12 |
| FALSE | NA | TRUE | Test sample | qPCR | 8 |
| FALSE | NA | TRUE | Test sample | qPCR_daily | 1 |
| TRUE | FALSE | FALSE | NA | NA | 414 |
| TRUE | FALSE | TRUE | Clinical sample | mg, qPCR, amplicon | 66 |
| TRUE | FALSE | TRUE | Clinical sample | mg, qPCR, amplicon, luminex | 5 |
| TRUE | FALSE | TRUE | Clinical sample | qPCR, amplicon | 1 |
| TRUE | TRUE | FALSE | NA | NA | 864 |
| TRUE | TRUE | TRUE | Clinical sample | amplicon | 4 |
| TRUE | TRUE | TRUE | Clinical sample | flow | 1 |
| TRUE | TRUE | TRUE | Clinical sample | luminex, flow | 1 |
| TRUE | TRUE | TRUE | Clinical sample | mg, qPCR, amplicon | 49 |
| TRUE | TRUE | TRUE | Clinical sample | mg, qPCR, amplicon, flow | 3 |
| TRUE | TRUE | TRUE | Clinical sample | mg, qPCR, amplicon, luminex | 491 |
| TRUE | TRUE | TRUE | Clinical sample | mg, qPCR, amplicon, luminex, flow | 346 |
| TRUE | TRUE | TRUE | Clinical sample | mg, qPCR, luminex | 2 |
| TRUE | TRUE | TRUE | Clinical sample | mg, qPCR, qPCR_daily, amplicon, luminex, flow | 2 |
| TRUE | TRUE | TRUE | Clinical sample | qPCR, amplicon, luminex | 1 |
| TRUE | TRUE | TRUE | Clinical sample | qPCR, amplicon, luminex, flow | 1 |
| TRUE | TRUE | TRUE | Clinical sample | qPCR_daily, amplicon | 2985 |
| TRUE | TRUE | TRUE | Clinical sample | qPCR_daily, amplicon, luminex | 1 |
study_day for daily visit_code that did not have CRF dataThere were a few entries with daily visit_code that did not have a date in the CRF data. Consequently they also did not have study days (study_day) assigned.
mae_coldata |>
filter(
is.na(study_day),
is.na(sample_type) | (sample_type == "Clinical sample")
) |>
dplyr::count(visit_type, assays, in_CRF, visit_attended, sample_type)# A tibble: 5 × 6
visit_type assays in_CRF visit_attended sample_type n
<chr> <chr> <lgl> <lgl> <chr> <int>
1 Clinic <NA> TRUE FALSE <NA> 38
2 Home qPCR_daily, amplicon TRUE FALSE Clinical sample 100
3 Home qPCR_daily, amplicon TRUE TRUE Clinical sample 3
4 Home <NA> TRUE FALSE <NA> 540
5 Home <NA> TRUE TRUE <NA> 7
We impute the missing values using the study_days from “surrounding” visits.
# first, we create a "dictionary" with the most common study_days for each daily swab visit code
study_days_for_daily_codes <-
mae_coldata |>
dplyr::filter(visit_type == "Home") |>
dplyr::count(visit_code, study_day) |>
arrange(visit_code, -n) |>
group_by(visit_code) |>
slice_head(n = 1) |>
ungroup() |>
dplyr::rename(exp_study_day = study_day) |>
select(-n)study_days_for_daily_codes |>
ggplot(aes(x = exp_study_day, y = visit_code)) +
geom_point()# In the plot above, we notice that visits 1501:1503 did not have "expected" study day, so we create it by expanding the time series
study_days_for_daily_codes <-
study_days_for_daily_codes |>
mutate(
exp_study_day =
case_when(
is.na(exp_study_day) & (visit_code %in% c(1501:1503)) ~
as.numeric(visit_code) - 1501 + 36,
TRUE ~ exp_study_day
)
)# visits from the participants with missing study_days
pid_with_missing_daily_study_days <-
# we first filter to retrieve the pids
mae_coldata |>
filter(is.na(study_day)) |> # , in_assays
select(pid) |> distinct() |>
# we join with the coldata
left_join(mae_coldata, by = join_by(pid)) |>
arrange(pid, visit_code) |>
# we join the study_day dictionary
left_join(study_days_for_daily_codes) |>
mutate(diff = study_day - exp_study_day) |> # the typical difference for a participant between their study_day and the "expected" study_day (some participant started sampling at home one day early or late)
group_by(pid) |>
fill(diff, .direction = "downup") |> # impute when missing
mutate(diff = ifelse(all(is.na(diff)), 0, diff)) |> # if still missing,we set to 0
ungroup() |>
# we fix the study_day
mutate(
fixed_study_day =
case_when(
is.na(study_day) & !is.na(exp_study_day) ~ exp_study_day + diff,
TRUE ~ study_day
)
) |>
select(uid, pid, visit_code, study_day, fixed_study_day, exp_study_day, diff, in_CRF, in_assays, crf_plates, assays)
# pid_with_missing_daily_study_days |> View()mae_coldata <-
mae_coldata |>
left_join(
pid_with_missing_daily_study_days |> select(uid, fixed_study_day),
by = join_by(uid)
) |>
mutate(
study_day =
case_when(
is.na(study_day) ~ fixed_study_day,
TRUE ~ study_day
)
) |>
select(-fixed_study_day)mae_coldata |>
filter(is.na(study_day), in_assays) |>
View()
# all is fixedvisit, visit_number, and study_week columnsvisit_dict <-
bind_rows(
tibble(
visit = "Screening",
definition = "Screening visit for which we have the most assay data. If two screening visits have the same number of assays, we take the last one.",
possible_visit_codes = "0000, 0001, 0010, 0011",
visit_number = "V0",
study_week = -1
),
tibble(
visit = "Add. screening",
definition = 'If a participant has data for two screening visits, one is the "main" screening visit, and the other is labelled as an additional screening visit.',
possible_visit_codes = "0000, 0001, 0010, 0011",
visit_number = "V0b",
study_week = -2
),
tibble(
visit = "Day 0 (pre-MTZ)",
definition = "Visit before the metronidazole treatment (MTZ) starts.",
possible_visit_codes = "1000, 1011",
visit_number = "V1",
study_week = 0
),
tibble(
visit = "Week 1 (post-MTZ)",
definition = "Visit after the metronidazole treatment (MTZ) ends.",
possible_visit_codes = "1100",
visit_number = "V2",
study_week = 1
),
tibble(
visit = "Week 2 (post-SP)",
definition = "Visit after the study product (SP) ends.",
possible_visit_codes = "1200, 1211",
visit_number = "V3",
study_week = 2
),
tibble(
visit = "Week 3",
definition = "Visit one week after the study product (SP) ends.",
possible_visit_codes = "1300",
visit_number = "V4",
study_week = 3
),
tibble(
visit = "Week 4",
definition = "Visit two weeks after the study product (SP) ends.",
possible_visit_codes = "1400",
visit_number = "V5",
study_week = 4
),
tibble(
visit = "Week 5",
definition = "Visit three weeks after the study product (SP) ends.",
possible_visit_codes = "1500, 1511, 1512",
visit_number = "V6",
study_week = 5
),
tibble(
visit = "Week 7",
definition = "Visit five weeks after the study product (SP) ends.",
possible_visit_codes = "1700, 1711",
visit_number = "V7",
study_week = 7
),
tibble(
visit = "Week 9",
definition = "Visit seven weeks after the study product (SP) ends.",
possible_visit_codes = "1900, 1911",
visit_number = "V8",
study_week = 9
),
tibble(
visit = "Week 12",
definition = "Visit ten weeks after the study product (SP) ends.",
possible_visit_codes = "2120, 2121, 2122",
visit_number = "V9",
study_week = 12
)
) |>
mutate(
visit = visit |> fct_reorder(study_week)
)visit_dict |>
gt() |>
tab_header(
title = "Visit codes and definitions"
) |>
cols_label(
visit = "Visit",
definition = "Definition",
possible_visit_codes = "Possible visit codes",
visit_number = "Visit number",
study_week = "Study week"
) |>
cols_align(
align = "left", columns = c(visit, definition, possible_visit_codes)
)| Visit codes and definitions | ||||
|---|---|---|---|---|
| Visit | Definition | Possible visit codes | Visit number | Study week |
| Screening | Screening visit for which we have the most assay data. If two screening visits have the same number of assays, we take the last one. | 0000, 0001, 0010, 0011 | V0 | -1 |
| Add. screening | If a participant has data for two screening visits, one is the "main" screening visit, and the other is labelled as an additional screening visit. | 0000, 0001, 0010, 0011 | V0b | -2 |
| Day 0 (pre-MTZ) | Visit before the metronidazole treatment (MTZ) starts. | 1000, 1011 | V1 | 0 |
| Week 1 (post-MTZ) | Visit after the metronidazole treatment (MTZ) ends. | 1100 | V2 | 1 |
| Week 2 (post-SP) | Visit after the study product (SP) ends. | 1200, 1211 | V3 | 2 |
| Week 3 | Visit one week after the study product (SP) ends. | 1300 | V4 | 3 |
| Week 4 | Visit two weeks after the study product (SP) ends. | 1400 | V5 | 4 |
| Week 5 | Visit three weeks after the study product (SP) ends. | 1500, 1511, 1512 | V6 | 5 |
| Week 7 | Visit five weeks after the study product (SP) ends. | 1700, 1711 | V7 | 7 |
| Week 9 | Visit seven weeks after the study product (SP) ends. | 1900, 1911 | V8 | 9 |
| Week 12 | Visit ten weeks after the study product (SP) ends. | 2120, 2121, 2122 | V9 | 12 |
We first determine, for each participant, which is their main vs. their additional screening visit.
mae_coldata <-
mae_coldata |>
group_by(pid) |>
mutate(
is_screening_visit =
case_when(
visit_code %in% c("0000", "0001", "0010", "0011") ~ TRUE,
TRUE ~ FALSE
),
n_assays = ifelse(in_assays, str_split(assays, ", ") |> lengths(), 0),
screening_visit_score = is_screening_visit * (n_assays + study_day/10)
) |>
arrange(-screening_visit_score) |>
mutate(
screening_visit =
case_when(
is_screening_visit & (row_number() == 1) ~ "main screening",
is_screening_visit & (row_number() > 1) ~ "additional screening",
TRUE ~ NA_character_
)
) |>
ungroup() |>
select(-is_screening_visit, -n_assays, -screening_visit_score)mae_coldata |>
dplyr::filter(screening_visit == "additional screening") |>
select(pid) |> distinct() |>
left_join(mae_coldata) |>
arrange(randomized |> desc(), pid, visit_code) |>
select(randomized, pid, visit_code, study_day, crf_plates, assays, screening_visit) |>
View()
mae_coldata |>
dplyr::filter(screening_visit == "additional screening") |>
select(pid) |> distinct() |>
left_join(mae_coldata) |>
arrange(randomized |> desc(), pid, visit_code) |>
select(randomized, pid, visit_code, study_day, crf_plates, assays, screening_visit) |>
dplyr::filter(randomized, (screening_visit == "main screening"), study_day == 0) |>
select(pid) |> distinct() |>
left_join(mae_coldata) |>
arrange(randomized |> desc(), pid, visit_code) |>
select(randomized, mITT, pid, visit_code, study_day, crf_plates, assays, screening_visit) |>
View()
# > check 068200020
# unclear - her main screening visit could be at day -3, but it might be at day 0, which would mean that her screening visit and pre-MTZ visit samples were collected on the same day
crf_raw$crf35 |> dplyr::filter(pid == "068200020") |> View()mae_coldata |>
filter(!is.na(screening_visit)) |>
dplyr::count(screening_visit) # A tibble: 2 × 2
screening_visit n
<chr> <int>
1 additional screening 66
2 main screening 532
We next join the mae_coldata table with the visit_dict table to add the visit, visit_number, and study_week columns.
mae_coldata <-
mae_coldata |>
select(-any_of(c("PIPV", "visit", "visit_number", "study_week"))) |>
left_join(
visit_dict |>
mutate(visit_code = possible_visit_codes |> str_split(", ")) |>
unnest(visit_code) |>
mutate(
screening_visit =
case_when(
visit_number == "V0b" ~ "additional screening",
visit_number == "V0" ~ "main screening",
TRUE ~ NA_character_
),
PIPV = visit_number != "V0b"
) |>
select(visit_code, screening_visit, PIPV, visit, visit_number, study_week)
) |>
mutate(PIPV = PIPV |> replace_na(FALSE)) |>
relocate(
PIPV, visit, visit_number, study_week, .after = "visit_code"
) We check that study_days are unique for each visit_number at which assay samples were collected for each participant.
mae_coldata |>
group_by(pid, visit_number) |>
mutate(n_study_day = n_distinct(study_day[!is.na(assays)])) |>
ungroup() |>
filter(PIPV, n_study_day > 1) |>
select(uid, pid, visit_code, study_day, assays) |>
group_by(pid) |>
gt(row_group_as_column = TRUE)| uid | visit_code | study_day | assays | |
|---|---|---|---|---|
| 068200062 | 068200062_1500 | 1500 | 37 | mg, qPCR, amplicon |
| 068200062_1511 | 1511 | 44 | luminex, flow | |
| 068200087 | 068200087_1500 | 1500 | 40 | mg, qPCR, amplicon, luminex |
| 068200087_1511 | 1511 | 43 | flow |
There are just two participants for which the dates don’t match and the different assays were collected at different dates. This may be important for integrative analyses later, so we leave as is for now.
specimen_collection_swab (collected, not collected, unclear)specimen_collection_softcup (collected, not collected, unclear)specimen_collection_cytobrush (collected, not collected, unclear)mgqPCRampliconluminexflowsample_and_visit_info_assay <-
mae_coldata |>
select(uid, sample_type, in_CRF, in_assays, randomized)
sample_and_visit_info_assay <-
sample_and_visit_info_assay |>
dplyr::left_join(
visits |> select(uid, starts_with("specimen_collection")),
by = join_by(uid)
)
sample_and_visit_info_assay <-
sample_and_visit_info_assay |>
expand_grid(assay = assay_list$mae_assay_name) |>
dplyr::left_join(
available_samples |>
select(uid, assay) |>
mutate(value = TRUE),
by = join_by(uid, assay)
) |>
mutate(value = value |> replace_na(FALSE)) |>
pivot_wider(names_from = assay, values_from = value)
# sample_and_visit_info_assay <-
# sample_and_visit_info_assay |>
# mutate(
# mg_category =
# case_when(
# !str_detect(sample_type, "Clinical sample") & mg ~ "Not a clinical sample",
# is.na(randomized) & mg ~ "???",
# !is.na(randomized) & !randomized & mg ~ "Additional sample from non-randomized participants",
# !is.na(specimen_collection_swab) & (specimen_collection_swab > 0) & mg ~ "Expected data",
# (is.na(specimen_collection_swab) | (specimen_collection_swab == 0)) & mg ~ "Unexpected data",
# !is.na(specimen_collection_swab) & (specimen_collection_swab > 0) & !mg ~ "Missing data",
# (is.na(specimen_collection_swab) | (specimen_collection_swab == 0)) & !mg ~ "Sample not collected",
# TRUE ~ "ERROR"
# ),
# qPCR_category =
# case_when(
# !str_detect(sample_type, "Clinical sample") & qPCR ~ "Not a clinical sample",
# is.na(randomized) & qPCR ~ "???",
# !is.na(randomized) & !randomized & qPCR ~ "Additional sample from non-randomized participants",
# !is.na(specimen_collection_swab) & (specimen_collection_swab > 0) & qPCR ~ "Expected data",
# (is.na(specimen_collection_swab) | (specimen_collection_swab == 0)) & qPCR ~ "Unexpected data",
# !is.na(specimen_collection_swab) & (specimen_collection_swab > 0) & !qPCR ~ "Missing data",
# (is.na(specimen_collection_swab) | (specimen_collection_swab == 0)) & !qPCR ~ "Sample not collected",
# TRUE ~ "ERROR"
# ),
# amplicon_category =
# case_when(
# !str_detect(sample_type, "Clinical sample") & amplicon ~ "Not a clinical sample",
# is.na(randomized) & amplicon ~ "???",
# !is.na(randomized) & !randomized & amplicon ~ "Additional sample from non-randomized participants",
# !is.na(specimen_collection_swab) & (specimen_collection_swab > 0) & amplicon ~ "Expected data",
# (is.na(specimen_collection_swab) | (specimen_collection_swab == 0)) & amplicon ~ "Unexpected data",
# !is.na(specimen_collection_swab) & (specimen_collection_swab > 0) & !amplicon ~ "Missing data",
# (is.na(specimen_collection_swab) | (specimen_collection_swab == 0)) & !amplicon ~ "Sample not collected",
# TRUE ~ "ERROR"
# ),
# luminex_category =
# case_when(
# !str_detect(sample_type, "Clinical sample") & luminex ~ "Not a clinical sample",
# is.na(randomized) & luminex ~ "???",
# !is.na(randomized) & !randomized & luminex ~ "Additional sample from non-randomized participants",
# !is.na(specimen_collection_softcup) & (specimen_collection_softcup != "not collected") & luminex ~ "Expected data",
# (is.na(specimen_collection_softcup) | (specimen_collection_softcup == "not collected")) & luminex ~ "Unexpected data",
# !is.na(specimen_collection_softcup) & (specimen_collection_softcup == "collected") & !luminex ~ "Missing data",
# (is.na(specimen_collection_softcup) | (specimen_collection_softcup == "not collected")) & !luminex ~ "Sample not collected",
# TRUE ~ "ERROR"
# )
# )
#
# The last 5 columns can take the following values:
#
# - `expected` (swab collected and available data),
# - `unexpected` (swab not collected but available data),
# - `missing` (swab collected but no available data),
# - `maybe missing` (unclear if swab was collected and no available data),
# - `not collected` (swab not collected),
# - `additional` (swab collected from a non-randomized participant),
# - `NA` (not collected, no available data)
# se_sample_and_visit_info_assay <-
SummarizedExperiment(
assays = list(
sample_and_visit_info =
sample_and_visit_info_assay |>
select(-c(sample_type, in_CRF, in_assays, randomized)) |>
as.data.frame() |>
column_to_rownames("uid") |>
t()
),
colData =
sample_and_visit_info_assay |> select(uid) |>
mutate(rownames = uid) |> as.data.frame() |>
column_to_rownames("rownames")
)MultiAssayExperiment objectSE_list <-
list(
visit_and_sample_info = se_sample_and_visit_info_assay,
mg = se_mg,
qPCR = se_qPRC,
qPCR_daily = se_daily_qPRC,
amplicon = se_16S,
luminex = se_luminex,
flow = se_flow
)
mae <-
MultiAssayExperiment::MultiAssayExperiment(
experiments = MultiAssayExperiment::ExperimentList(SE_list),
colData =
mae_coldata |> select(-in_CRF, -in_assays) |>
as.data.frame() |> mutate(rownames = uid) |>
column_to_rownames("rownames"),
metadata = list(
study = "VIBRANT",
data_source = "actual study data (not simulated)",
data_status = "partially QCed",
date = today(),
colData_dictionary = participants_variable_dictionary,
crf_plates_description = crf_plates_dictionary,
crf_data_raw = crf_raw,
crf_data_clean = crf_clean,
participant_crfs_merged = participant_crfs_merged,
visits_crfs_merged = visits_crfs_merged,
exposures = exposures
)
)maeA MultiAssayExperiment object of 7 listed
experiments with user-defined names and respective classes.
Containing an ExperimentList class object of length 7:
[1] visit_and_sample_info: SummarizedExperiment with 9 rows and 5463 columns
[2] mg: SummarizedExperiment with 779 rows and 996 columns
[3] qPCR: SummarizedExperiment with 16 rows and 1029 columns
[4] qPCR_daily: SummarizedExperiment with 16 rows and 2990 columns
[5] amplicon: SummarizedExperiment with 817 rows and 4119 columns
[6] luminex: SummarizedExperiment with 48 rows and 850 columns
[7] flow: SummarizedExperiment with 11 rows and 356 columns
Functionality:
experiments() - obtain the ExperimentList instance
colData() - the primary/phenotype DataFrame
sampleMap() - the sample coordination DataFrame
`$`, `[`, `[[` - extract colData columns, subset, or experiment
*Format() - convert into a long or wide DataFrame
assays() - convert ExperimentList to a SimpleList of matrices
exportClass() - save data to flat files
mae_sample_map <-
mae@sampleMap |> as_tibble() |> dplyr::filter(assay %in% assay_list$mae_assay_name) |>
select(uid = primary, assay) |>
arrange(uid) |>
dplyr::left_join(assay_list |> dplyr::rename(assay = mae_assay_name), by = join_by(assay)) |>
dplyr::right_join(
mae@colData |> as.data.frame() |> as_tibble() |>
select(uid, pid, visit_code, PIPV, visit, visit_type, sample_type, location, randomized, arm),
by = join_by(uid)
)All samples
mae_sample_map |>
dplyr::filter(!is.na(assay)) |>
group_by(uid) |> mutate(n = n()) |> ungroup() |>
mutate(uid = uid |> fct_reorder(-n)) |>
ggplot() +
aes(x = uid, y = assay_label |> fct_rev(), fill = assay_label) +
geom_tile() +
facet_grid(. ~ sample_type, scales = "free_x", space = "free_x") +
scale_x_discrete("Sample IDs", breaks = NULL) +
ylab("") +
scale_fill_brewer("Assay", type = "qual", palette = 7) + # 3
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
strip.text.x = element_text(angle = 90),
legend.position = "bottom"
)Longitudinal availability for all visits
mae_sample_map |>
dplyr::filter(!is.na(assay)) |>
# filter(visit_code %in% c("0000", seq(1000, 1900, by = 100), "2120")) |>
ggplot() +
aes(x = assay_label, y = pid, fill = assay_label) +
geom_tile() +
facet_grid(
ifelse(randomized, "Randomized", "Not randomized") + location ~ visit_code,
scales = "free_y", space = "free_y"
) + # sample_category
scale_y_discrete("Participants IDs") + #, breaks = "NULL") +
xlab("") +
scale_fill_brewer("Assay", type = "qual", palette = 7) + # 3
theme(
axis.text.x = element_blank(), #element_text(angle = 45, hjust = 1),
strip.text.y = element_text(angle = 0),
strip.text.x = element_text(angle = 90),
legend.position = "bottom"
)For the weekly visits of all participants
mae_sample_map |>
dplyr::filter(!is.na(assay)) |>
dplyr::filter(visit_type == "Clinic") |>
ggplot() +
aes(x = assay_label, y = pid, fill = assay_label) +
geom_tile() +
facet_grid(
location + ifelse(randomized, "Randomized", "Not randomized") + arm ~ visit_code,
scales = "free_y", space = "free_y"
) + # sample_category
scale_y_discrete("Participants IDs") + #, breaks = "NULL") +
xlab("") +
scale_fill_brewer("Assay", type = "qual", palette = 7) + # 3
theme(
axis.text.x = element_blank(), #element_text(angle = 45, hjust = 1),
strip.text.y = element_text(angle = 0),
strip.text.x = element_text(angle = 90),
legend.position = "bottom"
)For the weekly visits of randomized participants
mae_sample_map |>
dplyr::filter(randomized) |>
dplyr::filter(!is.na(assay)) |>
dplyr::filter(visit_type == "Clinic") |>
ggplot() +
aes(x = assay_label, y = pid, fill = assay_label) +
geom_tile() +
facet_grid(
location + ifelse(randomized, "Randomized", "Not randomized") + arm ~ visit_code,
scales = "free_y", space = "free_y"
) + # sample_category
scale_y_discrete("Participants IDs") + #, breaks = "NULL") +
xlab("") +
scale_fill_brewer("Assay", type = "qual", palette = 7) + # 3
theme(
axis.text.x = element_blank(), #element_text(angle = 45, hjust = 1),
strip.text.y = element_text(angle = 0),
strip.text.x = element_text(angle = 90),
legend.position = "bottom"
)mae_sample_map |>
dplyr::filter(randomized) |>
dplyr::filter(!is.na(assay)) |>
dplyr::filter(PIPV) |>
ggplot() +
aes(x = assay_label, y = pid, fill = assay_label) +
geom_tile() +
facet_grid(
location + arm ~ visit,
scales = "free_y", space = "free_y"
) + # sample_category
scale_y_discrete("Participants IDs") + #, breaks = "NULL") +
xlab("") +
scale_fill_brewer("Assay", type = "qual", palette = 7) + # 3
theme(
axis.text.x = element_blank(), #element_text(angle = 45, hjust = 1),
strip.text.y = element_text(angle = 0),
strip.text.x = element_text(angle = 90),
legend.position = "bottom"
)MultiAssayExperiment objectsWe export the full MultiAssayExperiment object as an .rds file for further analyses.
saveRDS(
mae,
str_c(
get_02_output_dir(),
"mae_full_", today() |> str_remove_all("-"), ".rds"
)
)